Let’s quickly explore the dimensionality and the behaviors of different categorical features present in the dataset. But before that we will try to import the required libraries for our study.

library(tidyverse)
## -- Attaching packages ------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Assigning train and test data to variables.

train = read_csv('train.csv')
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Id = col_character(),
##   v2a1 = col_double(),
##   idhogar = col_character(),
##   dependency = col_character(),
##   edjefe = col_character(),
##   edjefa = col_character(),
##   meaneduc = col_double(),
##   overcrowding = col_double(),
##   SQBovercrowding = col_double(),
##   SQBdependency = col_double(),
##   SQBmeaned = col_double()
## )
## See spec(...) for full column specifications.
test = read_csv('test.csv')
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Id = col_character(),
##   v2a1 = col_double(),
##   idhogar = col_character(),
##   dependency = col_character(),
##   edjefe = col_character(),
##   edjefa = col_character(),
##   meaneduc = col_double(),
##   overcrowding = col_double(),
##   SQBovercrowding = col_double(),
##   SQBdependency = col_double(),
##   SQBmeaned = col_double()
## )
## See spec(...) for full column specifications.

A quick

head(train)
## # A tibble: 6 x 143
##   Id      v2a1 hacdor rooms hacapo  v14a refrig  v18q v18q1  r4h1  r4h2
##   <chr>  <dbl>  <int> <int>  <int> <int>  <int> <int> <int> <int> <int>
## 1 ID_2~ 190000      0     3      0     1      1     0    NA     0     1
## 2 ID_f~ 135000      0     4      0     1      1     1     1     0     1
## 3 ID_6~     NA      0     8      0     1      1     0    NA     0     0
## 4 ID_d~ 180000      0     5      0     1      1     1     1     0     2
## 5 ID_d~ 180000      0     5      0     1      1     1     1     0     2
## 6 ID_e~ 180000      0     5      0     1      1     1     1     0     2
## # ... with 132 more variables: r4h3 <int>, r4m1 <int>, r4m2 <int>,
## #   r4m3 <int>, r4t1 <int>, r4t2 <int>, r4t3 <int>, tamhog <int>,
## #   tamviv <int>, escolari <int>, rez_esc <int>, hhsize <int>,
## #   paredblolad <int>, paredzocalo <int>, paredpreb <int>, pareddes <int>,
## #   paredmad <int>, paredzinc <int>, paredfibras <int>, paredother <int>,
## #   pisomoscer <int>, pisocemento <int>, pisoother <int>, pisonatur <int>,
## #   pisonotiene <int>, pisomadera <int>, techozinc <int>,
## #   techoentrepiso <int>, techocane <int>, techootro <int>,
## #   cielorazo <int>, abastaguadentro <int>, abastaguafuera <int>,
## #   abastaguano <int>, public <int>, planpri <int>, noelec <int>,
## #   coopele <int>, sanitario1 <int>, sanitario2 <int>, sanitario3 <int>,
## #   sanitario5 <int>, sanitario6 <int>, energcocinar1 <int>,
## #   energcocinar2 <int>, energcocinar3 <int>, energcocinar4 <int>,
## #   elimbasu1 <int>, elimbasu2 <int>, elimbasu3 <int>, elimbasu4 <int>,
## #   elimbasu5 <int>, elimbasu6 <int>, epared1 <int>, epared2 <int>,
## #   epared3 <int>, etecho1 <int>, etecho2 <int>, etecho3 <int>,
## #   eviv1 <int>, eviv2 <int>, eviv3 <int>, dis <int>, male <int>,
## #   female <int>, estadocivil1 <int>, estadocivil2 <int>,
## #   estadocivil3 <int>, estadocivil4 <int>, estadocivil5 <int>,
## #   estadocivil6 <int>, estadocivil7 <int>, parentesco1 <int>,
## #   parentesco2 <int>, parentesco3 <int>, parentesco4 <int>,
## #   parentesco5 <int>, parentesco6 <int>, parentesco7 <int>,
## #   parentesco8 <int>, parentesco9 <int>, parentesco10 <int>,
## #   parentesco11 <int>, parentesco12 <int>, idhogar <chr>,
## #   hogar_nin <int>, hogar_adul <int>, hogar_mayor <int>,
## #   hogar_total <int>, dependency <chr>, edjefe <chr>, edjefa <chr>,
## #   meaneduc <dbl>, instlevel1 <int>, instlevel2 <int>, instlevel3 <int>,
## #   instlevel4 <int>, instlevel5 <int>, instlevel6 <int>,
## #   instlevel7 <int>, ...
train$Target[train$Target == 1] = 'Extreme Poverty'
train$Target[train$Target == 2] = 'Moderate Poverty'
train$Target[train$Target == 3] = 'Vulnerable Household'
train$Target[train$Target == 4] = 'Non-Vulnerable Household'

train %>%
ggplot(aes(Target))+
geom_bar(color = 'black', fill = 'tomato')+
xlab("Target Classes")+
ylab("Target Classes Count")

train %>%
ggplot(aes(x = as.factor(v18q)))+
geom_bar(colour = "black", fill = "blue")+
facet_wrap(~Target, scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(hjust = 1, size = 11))+
theme(strip.text = element_text(size = 10, face = "bold"))+
labs(x = "Tablet Owner", y = "Number of Households")+
scale_x_discrete(labels = c("No", "Yes"))

Majority of the households do not own a tablet but many do. And the interesting thing is that people in extreme poverty households own a tablet! Lets look at how they fare when compared together.

train %>%
ggplot(aes(as.factor(v18q)))+
geom_bar(aes(fill = as.factor(Target)), position = "dodge", color = "grey")+
labs(x = "Tablet Owner", y = "Number of Households")+
guides(fill = guide_legend('Household Type'))+
theme(axis.text.x = element_text(hjust = 1, size = 10))+
scale_x_discrete(labels = c("No", "Yes"))

Now lets look at the number of tablets owned by households which do own a tablet

train %>%
filter(!is.na(v18q1)) %>%
ggplot(aes(as.factor(v18q1)))+
geom_bar(color = "blue")
## Warning: package 'bindrcpp' was built under R version 3.4.4

So the maximum number of tablets owned is 1 but households do own 6 tablets

train %>%
filter(!is.na(v18q1)) %>%
ggplot(aes(x = as.factor(v18q1)))+
geom_bar(colour = "black", fill = "green")+
facet_wrap(~Target, scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(hjust = 1, size = 10))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(x = "Number of tablets owned", y = "Number of Households")

train %>%
filter(!is.na(v18q1)) %>%
ggplot(aes(as.factor(v18q1)))+
geom_bar(aes(fill = as.factor(Target)), position = "dodge", color = "black")+
labs(x = "Owns a tablet or not?", y = "Count")+
guides(fill = guide_legend('Household Type'))+
theme(axis.text.x = element_text(hjust = 1, size = 9))

Male Distribution

train %>%
ggplot(aes(as.factor(r4h1)))+
geom_bar(color = "grey", fill = "red", alpha = 0.5)+
labs(x = "Males younger than 12", "Number of Households")

train %>%
ggplot(aes(as.factor(r4h2)))+
geom_bar(color = "grey", fill = "black", alpha = 0.6)+
labs(x = "Males older than 12", "Number of Households")

train %>%
ggplot(aes(x = as.factor(Target)))+
geom_bar(colour = "black", fill = "red", alpha = 0.6)+
facet_wrap(~as.factor(r4h1), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Household Type and number of males younger than 12", x = "Household Type", y = "Number of Households")

We see that the number of males younger than 12 are high in extreme poverty and non-vulnerable households.

#ERROR IN EXECUTION. REVISIT THIS
train %>%
ggplot(aes(x = as.factor(Target)))+
geom_bar(colour = "grey19", fill = "maroon", alpha = 0.6)+
facet_wrap(~as.factor(r4h2), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 15, hjust = 1, size = 7))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Household Type and number of males older than 12 ", x = "Household Type", y = "Number of Households")

The number of data points concerning non-vulnerable households is very large. We can remove some datapoints and analyze them to understand the relationship of low income households and compare them

train %>%
filter(Target != "Non-Vulnerable Household") %>%
ggplot(aes(as.factor(r4h2)))+
geom_bar(aes(fill = as.factor(Target)), position = "dodge", color = "grey", alpha = 0.5)+
labs(x = "Number of Males older than 12", y = "Number of Households")+
guides(fill = guide_legend('Household Type'))+
theme(axis.text.x = element_text(hjust = 1, size = 9))

This reveals an interesting property about the extreme poverty household. There is a high population of males younger than 12 in extreme poverty households while the concentration of males older than 12 is lesser.

train %>%
ggplot(aes(as.factor(r4h3)))+
geom_bar(color = "black", fill = "gold", alpha = 0.6)+
labs(x = "Total Number of Males in Household", "Number of Households")

train %>%
ggplot(aes(x = as.factor(r4h3)))+
geom_bar(colour = "black", fill = "red", alpha = 0.5)+
facet_wrap(~as.factor(Target), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))+
theme(strip.text = element_text(face = "bold",size = 10 ))+
labs(title = "Association between Household Type and total number of males", x = "Type of Household", y = "Number of Households")

Female Distribution

train %>%
ggplot(aes(as.factor(r4m1)))+
geom_bar(color = "black", fill = "red", alpha = 0.5)+
labs(x = "Females younger than 12", "Number of Households")

train %>%
ggplot(aes(as.factor(r4m2)))+
geom_bar(color = "black", fill = "blue", alpha = 0.5)+
labs(x = "Females older than 12", "Number of Households")

train %>%
ggplot(aes(x = as.factor(Target)))+
geom_bar(colour = "black", fill = "red", alpha = 0.6)+
facet_wrap(~as.factor(r4m1), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Household Type and number of females younger than 12", x = "Household Type", y = "Number of Households")

train %>%
ggplot(aes(x = as.factor(Target)))+
geom_bar(colour = "grey19", fill = "maroon", alpha = 0.6)+
facet_wrap(~as.factor(r4m2), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 15, hjust = 1, size = 7))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Household Type and number of females older than 12", x = "Household Type", y = "Number of Households")

train %>%
ggplot()+
geom_bar(aes(x = as.factor(r4m1)), colour = "black", fill = "red", alpha = 0.5)+
geom_bar(aes(x = as.factor(r4m2)), colour = "black", fill = "blue", alpha = 0.25)+
facet_wrap(~as.factor(Target), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Household Type and number of females of different ages", x = "Type of Household", y = "Number of Households")

There is not a huge population of females younger than 12 in majority of the households. However, the number of females older than 12 are 1 in a majority of the households with the number going upto 5 in households with extreme poverty

train %>%
filter(Target != "Non-Vulnerable Household") %>%
ggplot(aes(as.factor(r4m2)))+
geom_bar(aes(fill = as.factor(Target)), position = "dodge", color = "grey", alpha = 0.5)+
labs(x = "Females older than 12", y = "Number of Households")+
guides(fill = guide_legend('Household Type'))+
theme(axis.text.x = element_text(hjust = 1, size = 9))

train %>%
filter(Target != "Non-Vulnerable Household") %>%
ggplot(aes(as.factor(r4m1)))+
geom_bar(aes(fill = as.factor(Target)), position = "dodge", color = "grey", alpha = 0.5)+
labs(x = "Females younger than 12", y = "Number of Households")+
guides(fill = guide_legend('Household Type'))+
theme(axis.text.x = element_text(hjust = 1, size = 9))

Females younger than 12 seem to be present majorly in moderate poverty households

train %>%
ggplot(aes(as.factor(r4m3)))+
geom_bar(color = "grey", fill = "red", alpha = 0.6)+
labs(x = "Total Number of Females in the Household", "Number of Households")

train %>%
ggplot(aes(x = as.factor(r4m3)))+
geom_bar(colour = "black", fill = "blue", alpha = 0.5)+
facet_wrap(~as.factor(Target), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, size = 8, hjust = 1))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Type of Household and total number of females ", x = "Household Type", y = "Number of Households")

8. Person Distribution

train %>%
ggplot(aes(as.factor(r4t1)))+
geom_bar(color = "grey", fill = "red", alpha = 0.6)+
labs(x = "Persons younger than 12", "Number of Households")

train %>%
ggplot(aes(as.factor(r4t2)))+
geom_bar(color = "grey", fill = "blue", alpha = 0.5)+
labs(x = "Persons older than 12", "Number of Households")

train %>%
ggplot()+
geom_bar(aes(as.factor(r4t1)), colour = "black", fill = "green", alpha = 0.5)+
facet_wrap(~as.factor(Target), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Type of Households and number of persons younger than 12 ", x = "Household Type", y = "Number of Households")

Moderate poverty and extreme poverty households have a huge number of persons younger than 12 whereas vulnerable and non-vulnerable households have a lesser number of persons age less than 12

train %>%
ggplot()+
geom_bar(aes(as.factor(r4t2)), colour = "black", fill = "green", alpha = 0.5)+
facet_wrap(~as.factor(Target), scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))+
theme(strip.text = element_text(size = 9, face = "bold"))+
labs(title = "Association between Type of Households and number of persons older than 12 ", x = "Household Type", y = "Number of Households")

There is a reversal of results here with the number of persons older than 12 being higher in non-vulnerable and vulnerable households.

train %>%
mutate(
    mean = r4t3/rooms
) %>%
ggplot(aes(x = mean))+
geom_histogram(colour = "black", fill = "blue", bins = 30, alpha = 0.4)+ 
facet_wrap(~Target, scales = "free", ncol = 3)+ 
theme(axis.text.x = element_text(hjust = 1, size = 12))+
theme(strip.text = element_text(face = "bold", size = 10))+
labs(title =" Mean Number of Persons vs Rooms", x = "Mean number of persons", y = "Number of Households", size = 12)

The number of persons in a room goes up as the households get poorer.

  1. Number of Years of Schooling
train %>%
filter(Target == "Extreme Poverty") %>%
ggplot(aes(x = as.factor(Target), stat = 'bin', fill = as.factor(escolari)))+
geom_bar(position = 'fill', color = 'black')+
coord_polar("y")+
labs(x = "", y = "")+
guides(fill = guide_legend("Years of schooling"))

train %>%
filter(Target == "Moderate Poverty") %>%
ggplot(aes(x = as.factor(Target), stat = 'bin', fill = as.factor(escolari)))+
geom_bar(position = 'fill', color = 'black')+
coord_polar("y")+
labs(x = "", y = "")+
guides(fill = guide_legend("Years of schooling"))

train %>%
filter(Target == "Vulnerable Household") %>%
ggplot(aes(x = as.factor(Target), stat = 'bin', fill = as.factor(escolari)))+
geom_bar(position = 'fill', color = 'black')+
coord_polar("y")+
labs(x = "", y = "")+
guides(fill = guide_legend("Years of schooling"))

train %>%
filter(Target == "Non-Vulnerable Household") %>%
ggplot(aes(x = as.factor(Target), stat = 'bin', fill = as.factor(escolari)))+
geom_bar(position = 'fill', color = 'black')+
coord_polar("y")+
labs(x = "", y = "")+
guides(fill = guide_legend("Years of schooling"))

Majority of persons in extreme poverty households have 0 years of households